Introduction

Reads were first de-multiplexed with ea-utils (fastq-multx) with an allowed barcode mismatch of 1 (Default). De-multiplexed reads were then subjected to an alignment against the UniVec vector contamination database using the adapter removal tool within ea-utils (fastq-mcf). Barcodes were extracted by remove the last 12 nucleotides from each read.

Next reads were subjected to an alignment against the Oligo library using a DNA aligner. We used Bowtie2…

Data import & preprocessing

We are going to first import the merged counts tables that is outputted from the Nextflow pipeline. This table already has the raw counts summarised from the alignments and merged into a single matrix.

Import oligos

  • oligo_1899 (MART1_ELA)
  • oligo_4281 (MART1_ELA)
  • oligo_912 (MART1)
  • oligo_3294 (MART1)
  • oligo_1887 (CDK4mut)
  • oligo_4269 (CDK4mut)
  • oligo_1888 (CDK4wt)
  • oligo_4270 (CDK4wt)
  • oligo_1889 (RPS3Amut)
  • oligo_4271 (RPS3Amut)
  • oligo_1890 (RPS3Awt)
  • oligo_4272 (RPS3Awt)
  • oligo_1895 (MANSC1mut)
  • oligo_4277 (MANSC1mut)
  • oligo_1896 (MANSC1wt)
  • oligo_4278 (MANSC1wt)
# Import counts
cd8.counts = readRDS('../data/CD8-counts.rda')

# Convert to long format
cd8.counts.long = cd8.counts %>% 
  rownames_to_column("Id") %>% 
  pivot_longer(cols = -c('Id'), names_to = 'sample', values_to = 'counts')

# Import CD4 metadata
cd8.metadata = readRDS('../data/CD8-metadata.rda')

# Import MultiQC results
cd8.report = readRDS('../data/CD8-multiqc.rda')

# Import oligo annotations
oligo.genes = readRDS('../data/oligo-genes.rda')

Summary stats

# Mean
mean(cd8.report$raw_total_sequences)
## [1] 14162045
# Median
median(cd8.report$raw_total_sequences)
## [1] 13417612
# Range
range(cd8.report$raw_total_sequences)
## [1]  8434046 23626015

Mapped reads per sample

cd8.report %>% 
  select(Sample, reads_unmapped, reads_mapped) %>% 
  mutate(Sample = str_replace(Sample, '-bamstats', '')) %>% 
  pivot_longer(cols = c("reads_unmapped", "reads_mapped"), names_to = "feature", values_to = "reads") %>% 
  ggplot(aes(x = Sample, y = reads, fill = feature)) +
  geom_col() +
  coord_flip() +
  theme_light(base_size = 11) +
  theme(legend.position = "top") +
  scale_fill_aaas() +
  xlab("") +
  ylab("Number of reads")

cd8.report %>% 
  select(Sample, reads_unmapped_percent, reads_mapped_percent) %>% 
  mutate(Sample = str_replace(Sample, '-bamstats', '')) %>% 
  pivot_longer(cols = c("reads_unmapped_percent", "reads_mapped_percent"), names_to = "feature", values_to = "reads") %>% 
  mutate(reads = reads / 100) %>% 
  ggplot(aes(x = Sample, y = reads, fill = feature)) +
  geom_col() +
  coord_flip() +
  theme_light(base_size = 11) +
  theme(legend.position = "top") +
  scale_fill_aaas() +
  scale_y_continuous(labels = scales::percent) +
  xlab("") +
  ylab("Percent of reads")

Mock replicates

cd8.counts %>% 
  ggplot(., aes(x = log10(mock_a), y = log10(mock_b))) +
  geom_point(colour = alpha("grey", 0.7)) +
  xlab("Mock replicate #1 (log10)") +
  ylab("Mock replicate #2 (log10)") +
  theme_light(base_size = 11) +
  stat_cor(method = "pearson", aes(label = ..r.label..)) 

1D3 replicates

cd8.counts %>% 
  ggplot(aes(x = log10(id3_a), y = log10(id3_b))) +
  geom_point(colour = alpha("grey", 0.7)) +
  xlab("1D3 replicate #1 (log10)") +
  ylab("1D3 replicate #2 (log10)") +
  theme_light(base_size = 11) +
  stat_cor(method = "pearson", aes(label = ..r.label..)) 

CDK4-53 replicates

cd8.counts %>% 
  ggplot(aes(x = log10(cdk453_a), y = log10(cdk453_b))) +
  geom_point(colour = alpha("grey", 0.7)) +
  xlab("CDK4/53 replicate #1 (log10)") +
  ylab("CDK4/53 replicate #2 (log10)") +
  theme_light(base_size = 11) +
  stat_cor(method = "pearson", aes(label = ..r.label..)) 

Dilution 1:10 replicates

cd8.counts %>% 
  ggplot(aes(x = log10(d1_10_a), y = log10(d1_10_b))) +
  geom_point(colour = alpha("grey", 0.7)) +
  xlab("1:10 replicate #1 (log10)") +
  ylab("1:10 replicate #2 (log10)") +
  theme_light(base_size = 11) +
  stat_cor(method = "pearson", aes(label = ..r.label..))

Dilution 1:100 replicates

cd8.counts %>% 
  ggplot(aes(x = log10(d1_100_a), y = log10(d1_100_b))) +
  geom_point(colour = alpha("grey", 0.7)) +
  xlab("1:100 replicate #1 (log10)") +
  ylab("1:100 replicate #2 (log10)") +
  theme_light(base_size = 11) +
  stat_cor(method = "pearson", aes(label = ..r.label..)) 

Dilution 1:300 replicates

cd8.counts %>% 
  ggplot(aes(x = log10(d1_300_a), y = log10(d1_300_b))) +
  geom_point(colour = alpha("grey", 0.7)) +
  xlab("1:300 replicate #1 (log10)") +
  ylab("1:300 replicate #2 (log10)") +
  theme_light(base_size = 11) +
  stat_cor(method = "pearson", aes(label = ..r.label..)) 

Dilution 1:1000 replicates

cd8.counts %>% 
  ggplot(aes(x = log10(d1_1000_a), y = log10(d1_1000_b))) +
  geom_point(colour = alpha("grey", 0.7)) +
  xlab("1:1000 replicate #1 (log10)") +
  ylab("1:1000 replicate #2 (log10)") +
  theme_light(base_size = 11) +
  stat_cor(method = "pearson", aes(label = ..r.label..)) 

Calculate oligo-rep variance

# Calculate the variance between oligo-replicates
oligo.variance = cd8.counts %>% 
  rownames_to_column("Id") %>%
  mutate_if(is.numeric, funs(log10)) %>% 
  left_join(oligo.genes, by = "Id") %>% 
  group_by(oligoGroup) %>% 
  summarise(mock_a = diff(mock_a),
            mock_b = diff(mock_b),
            cdk453_a = diff(cdk453_a),
            cdk453_b = diff(cdk453_b),
            id3_a = diff(id3_a),
            id3_b = diff(id3_b),
            d1_10_a = diff(d1_10_a),
            d1_10_b = diff(d1_10_b),
            d1_100_a = diff(d1_100_a),
            d1_100_b = diff(d1_100_b),
            d1_300_a = diff(d1_300_a),
            d1_300_b = diff(d1_300_b),
            d1_1000_a = diff(d1_1000_a),
            d1_1000_b = diff(d1_1000_b))

# Plot histogram of variance across samples
oligo.variance %>% 
  pivot_longer(-oligoGroup, names_to = "Sample", values_to = "difference") %>% 
  ggplot(aes(x = difference)) +
  geom_histogram(bins = 100) +
  theme_bw() +
  facet_wrap(.~Sample) +
  xlab("Difference (log10)")

Median-normalization

Prepare data for DESeq2 object

# Match the sample ID's between samples
cd8.counts = cd8.counts[,match(cd8.metadata$Sample, colnames(cd8.counts))]

# Make DEseq2 object
ddsMat.cd8 <- DESeqDataSetFromMatrix(countData = cd8.counts,
                                     colData = cd8.metadata,
                                     design = ~Group)
## converting counts to integer mode

Normalize

# Get normalize counts
ddsMat.cd8 <- estimateSizeFactors(ddsMat.cd8)
ddsMat.cd8 <- estimateDispersions(ddsMat.cd8, fitType='local')
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# Get normalized counts 
cd8.norm <- counts(ddsMat.cd8, normalized = TRUE) %>% 
  as.data.frame() %>% 
  rownames_to_column("Id")

# Write tables to disk
write_tsv(cd8.counts, "tables/cd8-counts.tsv")
write_tsv(cd8.norm, "tables/cd8-counts-norm.tsv")

# Create a long table 
cd8.norm.long = cd8.norm %>% 
  gather(sample, norm.counts, -Id) 

Plot boxplots

# Pre-normalized
cd8.counts.long %>% 
  ggplot(aes(x = sample, y = log10(counts + 1))) +
  geom_boxplot() +
  theme_light(base_size = 11) +
  theme(axis.text.x = element_text(angle = 90)) +
  xlab("") +
  ylab("Counts (log10)") +
  ggtitle('Un-normalized data')

# After normalized
cd8.norm.long %>% 
  ggplot(aes(x = sample, y = log10(norm.counts + 1))) +
  geom_boxplot() +
  theme_light(base_size = 11) +
  theme(axis.text.x = element_text(angle = 90)) +
  xlab("") +
  ylab("Normalized counts (log10)") +
  ggtitle('Median-normalized data')

Plot normalized histogram

cd8.norm.long %>% 
  ggplot(aes(x = log10(norm.counts))) +
  geom_histogram(aes(y=..density..), binwidth=0.25, colour="black", fill="white") +
  geom_density(alpha = 0.2, fill = "#FF6666") +
  facet_wrap(sample~.) +
  theme_light(base_size = 11) +
  ylab("Density") +
  xlab("Normalized reads per oligo (log10)") 

MART1/1D3 dropout

Oligo Id’s for the positive control dropout’s are: - oligo_1899 (MART1_ELA) - oligo_4281 (MART1_ELA) - oligo_912 (MART1) - oligo_3294 (MART1)

# Get average between the replicates
id3.mean = cd8.norm %>% 
  mutate(Mock_mean = rowMeans(select(., mock_a, mock_b)),
         ID3_mean = rowMeans(select(., id3_a, id3_b))) %>% 
  mutate(M = log2(ID3_mean / Mock_mean),
         A = (log2(ID3_mean) + log2(Mock_mean)) / 2) 

# Plot scatterplot
id3.mean %>% 
  ggplot(aes(x = log10(Mock_mean), y = log10(ID3_mean))) +
  geom_point(colour = alpha("grey", 0.7)) +
  geom_point(colour = alpha("red", 0.7), data = filter(id3.mean, Id %in% c("oligo_4281", "oligo_1899"))) +
  geom_point(colour = alpha("lightblue", 0.7), data = filter(id3.mean, Id %in% c("oligo_913", "oligo_3295"))) +
  geom_point(colour = alpha("orange", 0.7), data = filter(id3.mean, Id %in% c("oligo_1888", "oligo_4270"))) +
  geom_point(colour = alpha("darkgreen", 0.7), data = filter(id3.mean, Id %in% c("oligo_1887", "oligo_4269"))) +
  theme_light(base_size = 11) +
  xlab("Mock normalized counts (log10)") +
  ylab("1D3 normalized counts (log10)")

# Plot MA plot
id3.mean %>% 
  ggplot(aes(x = A, y = M)) +
  geom_point(colour = alpha("grey", 0.5)) +
  geom_point(colour = alpha("red", 0.7), data = filter(id3.mean, Id %in% c("oligo_4281", "oligo_1899"))) +
  geom_point(colour = alpha("lightblue", 0.7), data = filter(id3.mean, Id %in% c("oligo_913", "oligo_3295"))) +
  geom_point(colour = alpha("orange", 0.7), data = filter(id3.mean, Id %in% c("oligo_1888", "oligo_4270"))) +
  geom_point(colour = alpha("darkgreen", 0.7), data = filter(id3.mean, Id %in% c("oligo_1887", "oligo_4269"))) +
  theme_light(base_size = 11) +
  xlab(expression(paste("Mean normalized counts ", '(', log[2], ")"))) +
  ylab(expression(paste("Fold change ", '(', log[2], " 1D3/Mock)"))) 

Calculate statistics

# Create new DESeq2 object with poor samples removed
ddsMat.subset = ddsMat.cd8[,which(colnames(assay(ddsMat.cd8)) %in% c("mock_a", "mock_b", "id3_a", "id3_b"))]
ddsMat.subset$Group = droplevels(ddsMat.subset$Group)
ddsMat.subset$Group = relevel(ddsMat.subset$Group, "Mock")

# Run stats
ddsMat.subset <- DESeq(ddsMat.subset)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# Get results (ID3 / Mock)
results(ddsMat.subset, alpha = 0.05) %>% 
  summary()
## 
## out of 4762 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 0, 0%
## LFC < 0 (down)     : 5, 0.1%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Get table of results
id3.mock.res = results(ddsMat.subset, alpha = 0.05)

# View table
id3.mock.res %>% 
  as.data.frame() %>% 
  rownames_to_column("Id") %>% 
  filter(padj < 0.05)
##           Id baseMean log2FoldChange     lfcSE       stat        pvalue
## 1 oligo_1899 1616.221     -2.5925091 0.1668882 -15.534411  2.029294e-54
## 2 oligo_3295 2359.897     -1.1754412 0.1121242 -10.483386  1.029893e-25
## 3 oligo_4281 1750.376     -4.1132380 0.1543848 -26.642770 2.170495e-156
## 4 oligo_4374 1994.070     -0.5718696 0.1206310  -4.740650  2.130336e-06
## 5  oligo_913 2108.672     -0.6154804 0.1159946  -5.306114  1.119872e-07
##            padj
## 1  4.831750e-51
## 2  1.634784e-22
## 3 1.033590e-152
## 4  2.028932e-03
## 5  1.333208e-04
# Show quick plot of significance ()
DESeq2::plotMA(id3.mock.res, ylim = c(-3, 3))

CDK4/53 dropout

Oligo Id’s for the positive control dropout’s are: - oligo_1899 (MART1_ELA) - oligo_4281 (MART1_ELA) - oligo_912 (MART1) - oligo_3294 (MART1)

# Get average between the replicates
cdk4.mean = cd8.norm %>% 
  mutate(Mock_mean = rowMeans(select(., mock_a, mock_b)),
         CDK4_mean = rowMeans(select(., cdk453_a, cdk453_b))) %>% 
  mutate(M = log2(CDK4_mean / Mock_mean),
         A = (log2(CDK4_mean) + log2(Mock_mean)) / 2) 

# Plot scatterplot
cdk4.mean %>% 
  ggplot(aes(x = log10(Mock_mean), y = log10(CDK4_mean))) +
  geom_point(colour = alpha("grey", 0.7)) +
  geom_point(colour = alpha("red", 0.7), data = filter(cdk4.mean, Id %in% c("oligo_4281", "oligo_1899"))) +
  geom_point(colour = alpha("orange", 0.7), data = filter(cdk4.mean, Id %in% c("oligo_1888", "oligo_4270"))) +
  geom_point(colour = alpha("lightblue", 0.7), data = filter(cdk4.mean, Id %in% c("oligo_913", "oligo_3295"))) +
  geom_point(colour = alpha("darkgreen", 0.7), data = filter(cdk4.mean, Id %in% c("oligo_1887", "oligo_4269"))) +
  theme_light(base_size = 11) +
  xlab("Mock normalized counts (log10)") +
  ylab("CDK4 normalized counts (log10)")

# Plot MA plot
cdk4.mean %>% 
  ggplot(aes(x = A, y = M)) +
  geom_point(colour = alpha("grey", 0.5)) +
  geom_point(colour = alpha("red", 0.7), data = filter(cdk4.mean, Id %in% c("oligo_4281", "oligo_1899"))) +
  geom_point(colour = alpha("orange", 0.7), data = filter(cdk4.mean, Id %in% c("oligo_1888", "oligo_4270"))) +
  geom_point(colour = alpha("lightblue", 0.7), data = filter(cdk4.mean, Id %in% c("oligo_913", "oligo_3295"))) +
  geom_point(colour = alpha("darkgreen", 0.7), data = filter(cdk4.mean, Id %in% c("oligo_1887", "oligo_4269"))) +
  theme_light(base_size = 11) +
  xlab(expression(paste("Mean normalized counts ", '(', log[2], ")"))) +
  ylab(expression(paste("Fold change ", '(', log[2], " CDK4/Mock)"))) 

Calculate statistics

# Create new DESeq2 object with poor samples removed
ddsMat.subset = ddsMat.cd8[,which(colnames(assay(ddsMat.cd8)) %in% c("mock_a", "mock_b", "cdk453_a", "cdk453_b"))]
ddsMat.subset$Group = droplevels(ddsMat.subset$Group)
ddsMat.subset$Group = relevel(ddsMat.subset$Group, "Mock")

# Run stats
ddsMat.subset <- DESeq(ddsMat.subset)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# Get results (CDK4 / Mock)
results(ddsMat.subset, alpha = 0.05) %>% 
  summary()
## 
## out of 4759 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 3, 0.063%
## LFC < 0 (down)     : 4, 0.084%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Get table of results
cdk4.mock.res = results(ddsMat.subset, alpha = 0.05)

# View table
cdk4.mock.res %>% 
  as.data.frame() %>% 
  rownames_to_column("Id") %>% 
  filter(padj < 0.05)
##           Id  baseMean log2FoldChange      lfcSE       stat       pvalue
## 1 oligo_1887  772.9214     -2.8901569 0.21330745 -13.549254 8.003909e-42
## 2 oligo_1895 6742.4730     -0.3312007 0.07032309  -4.709700 2.480816e-06
## 3  oligo_199 1372.7466      0.7148753 0.15410883   4.638769 3.504898e-06
## 4 oligo_3925 7876.4527      0.2904682 0.06628197   4.382311 1.174271e-05
## 5 oligo_4269  620.7388     -4.4225899 0.32319890 -13.683803 1.268798e-42
## 6 oligo_4311 8468.5055      0.2476976 0.05916615   4.186475 2.833199e-05
## 7 oligo_4374 1934.2865     -0.6837166 0.12417351  -5.506139 3.667891e-08
##           padj
## 1 1.904530e-38
## 2 2.951551e-03
## 3 3.335962e-03
## 4 9.313924e-03
## 5 6.038211e-39
## 6 1.926170e-02
## 7 5.818497e-05
# Show quick plot of significance ()
DESeq2::plotMA(cdk4.mock.res)

MART1/CDK4 dropout

Oligo Id’s for the positive control dropout’s are: - oligo_1899 (MART1_ELA) - oligo_4281 (MART1_ELA) - oligo_912 (MART1) - oligo_3294 (MART1)

# Get average between the replicates
id3.mean = cd8.norm %>% 
  mutate(Mock_mean = rowMeans(select(., mock_a, mock_b)),
         CDK4_mean = rowMeans(select(., cdk453_a, cdk453_b)),
         ID3_mean = rowMeans(select(., id3_a, id3_b))) %>% 
  mutate(M = log2(ID3_mean / CDK4_mean),
         A = (log2(ID3_mean) + log2(CDK4_mean)) / 2) 

# Plot scatterplot
id3.mean %>% 
  ggplot(aes(x = log10(CDK4_mean), y = log10(ID3_mean))) +
  geom_point(colour = alpha("grey", 0.7)) +
  geom_point(colour = alpha("red", 0.7), data = filter(id3.mean, Id %in% c("oligo_4281", "oligo_1899"))) +
  geom_point(colour = alpha("lightblue", 0.7), data = filter(id3.mean, Id %in% c("oligo_913", "oligo_3295"))) +
  geom_point(colour = alpha("orange", 0.7), data = filter(id3.mean, Id %in% c("oligo_1888", "oligo_4270"))) +
  geom_point(colour = alpha("darkgreen", 0.7), data = filter(id3.mean, Id %in% c("oligo_1887", "oligo_4269"))) +
  theme_light(base_size = 11) +
  xlab("CDK4 normalized counts (log10)") +
  ylab("1D3 normalized counts (log10)") 

# Plot MA plot
id3.mean %>% 
  ggplot(aes(x = A, y = M)) +
  geom_point(colour = alpha("grey", 0.5)) +
  geom_point(colour = alpha("red", 0.7), data = filter(id3.mean, Id %in% c("oligo_4281", "oligo_1899"))) +
  geom_point(colour = alpha("lightblue", 0.7), data = filter(id3.mean, Id %in% c("oligo_913", "oligo_3295"))) +
  geom_point(colour = alpha("orange", 0.7), data = filter(id3.mean, Id %in% c("oligo_1888", "oligo_4270"))) +
  geom_point(colour = alpha("darkgreen", 0.7), data = filter(id3.mean, Id %in% c("oligo_1887", "oligo_4269"))) +
  theme_light(base_size = 11) +
  xlab(expression(paste("Mean normalized counts ", '(', log[2], ")"))) +
  ylab(expression(paste("Fold change ", '(', log[2], " 1D3/CDK4)"))) 

Calculate statistics

# Create new DESeq2 object with poor samples removed
ddsMat.subset = ddsMat.cd8[,which(colnames(assay(ddsMat.cd8)) %in% c("cdk453_a", "cdk453_b", "id3_a", "id3_b"))]
ddsMat.subset$Group = droplevels(ddsMat.subset$Group)
ddsMat.subset$Group = relevel(ddsMat.subset$Group, "CDK4")

# Run stats
ddsMat.subset <- DESeq(ddsMat.subset)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# Get results (ID3 / CDK4)
results(ddsMat.subset, alpha = 0.05) %>% 
  summary()
## 
## out of 4750 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 4, 0.084%
## LFC < 0 (down)     : 5, 0.11%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Get table of results
id3.cdk4.res = results(ddsMat.subset, alpha = 0.05)

# View table
id3.cdk4.res %>% 
  as.data.frame() %>% 
  rownames_to_column("Id") %>% 
  filter(padj < 0.05)
##           Id  baseMean log2FoldChange      lfcSE       stat        pvalue
## 1 oligo_1887  804.7891      2.9557084 0.19686979  15.013520  5.988381e-51
## 2 oligo_1895 6743.7982      0.3322023 0.06410123   5.182463  2.189751e-07
## 3 oligo_1899 1499.7890     -2.4666321 0.16309495 -15.123902  1.126636e-51
## 4 oligo_2645 2861.1253     -0.4344652 0.10762636  -4.036792  5.418701e-05
## 5 oligo_3295 2439.4898     -1.2441102 0.11006520 -11.303393  1.262446e-29
## 6 oligo_3425 2457.9234      0.4598083 0.10624364   4.327866  1.505608e-05
## 7 oligo_4269  730.6316      4.6676557 0.27483605  16.983419  1.089508e-64
## 8 oligo_4281 1793.0922     -4.1501970 0.15374555 -26.993932 1.741353e-160
## 9  oligo_913 2207.1302     -0.7225103 0.10961534  -6.591325  4.359196e-11
##            padj
## 1  7.111203e-48
## 2  1.485903e-04
## 3  1.783841e-48
## 4  2.859870e-02
## 5  1.199324e-26
## 6  8.939545e-03
## 7  2.587582e-61
## 8 8.271426e-157
## 9  3.451030e-08
# Show quick plot of significance ()
DESeq2::plotMA(id3.cdk4.res)

Dilutions 1:10

# Get average between the replicates
d10.mean = cd8.norm %>% 
  mutate(Mock_mean = rowMeans(select(., mock_a, mock_b)),
         D10_mean = rowMeans(select(., d1_10_a, d1_10_b))) %>% 
  mutate(M = log2(D10_mean / Mock_mean),
         A = (log2(D10_mean) + log2(Mock_mean)) / 2) 

# Plot scatterplot
d10.scatter = d10.mean %>% 
  ggplot(aes(x = log10(Mock_mean), y = log10(D10_mean))) +
  geom_point(colour = alpha("grey", 0.7)) +
  geom_point(colour = alpha("red", 0.7), data = filter(d10.mean, Id %in% c("oligo_4281", "oligo_1899"))) +
  geom_point(colour = alpha("orange", 0.7), data = filter(d10.mean, Id %in% c("oligo_1888", "oligo_4270"))) +
  geom_point(colour = alpha("lightblue", 0.7), data = filter(d10.mean, Id %in% c("oligo_913", "oligo_3295"))) +
  geom_point(colour = alpha("darkgreen", 0.7), data = filter(d10.mean, Id %in% c("oligo_1887", "oligo_4269"))) +
  theme_light(base_size = 11) +
  xlab("Mock normalized counts (log10)") +
  ylab("D10 normalized counts (log10)")
d10.scatter

# Plot MA plot
d10.ma = d10.mean %>% 
  ggplot(aes(x = A, y = M)) +
  geom_point(colour = alpha("grey", 0.5)) +
  geom_point(colour = alpha("red", 0.7), data = filter(d10.mean, Id %in% c("oligo_4281", "oligo_1899"))) +
  geom_point(colour = alpha("orange", 0.7), data = filter(d10.mean, Id %in% c("oligo_1888", "oligo_4270"))) +
  geom_point(colour = alpha("lightblue", 0.7), data = filter(d10.mean, Id %in% c("oligo_913", "oligo_3295"))) +
  geom_point(colour = alpha("darkgreen", 0.7), data = filter(d10.mean, Id %in% c("oligo_1887", "oligo_4269"))) +
  theme_light(base_size = 11) +
  ylim(-5, 5) +
  xlab(expression(paste("Mean normalized counts ", '(', log[2], ")"))) +
  ylab(expression(paste("Fold change ", '(', log[2], " D10/Mock)")))
d10.ma

Calculate statistics

# Create new DESeq2 object with poor samples removed
ddsMat.subset = ddsMat.cd8[,which(colnames(assay(ddsMat.cd8)) %in% c("mock_a", "mock_b", "d1_10_a", "d1_10_b"))]
ddsMat.subset$Group = droplevels(ddsMat.subset$Group)
ddsMat.subset$Group = relevel(ddsMat.subset$Group, "Mock")

# Run stats
ddsMat.subset <- DESeq(ddsMat.subset)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# Get results (D10 / Mock)
results(ddsMat.subset, alpha = 0.05) %>% 
  summary()
## 
## out of 4759 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 0, 0%
## LFC < 0 (down)     : 7, 0.15%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Get table of results
d10.mock.res = results(ddsMat.subset, alpha = 0.05)

# View table
d10.mock.res %>% 
  as.data.frame() %>% 
  rownames_to_column("Id") %>% 
  filter(padj < 0.05)
##           Id  baseMean log2FoldChange     lfcSE       stat        pvalue
## 1 oligo_1887  800.8005     -2.5072726 0.2217680 -11.305835  1.227801e-29
## 2 oligo_1895 6718.5792     -0.3428076 0.0764119  -4.486312  7.246673e-06
## 3 oligo_1899 1621.4895     -2.5555430 0.1588604 -16.086722  3.161408e-58
## 4 oligo_3295 2611.2699     -0.7461890 0.1170938  -6.372577  1.858780e-10
## 5 oligo_3425 2275.5100     -0.5964802 0.1221796  -4.881996  1.050174e-06
## 6 oligo_4269  624.8460     -4.2270058 0.2828515 -14.944259  1.697680e-50
## 7 oligo_4281 1735.4737     -4.3506053 0.1653703 -26.308258 1.542788e-152
##            padj
## 1  1.460776e-26
## 2  4.926702e-03
## 3  7.522572e-55
## 4  1.769187e-07
## 5  8.329633e-04
## 6  2.693086e-47
## 7 7.342126e-149
# Show quick plot of significance ()
DESeq2::plotMA(d10.mock.res)

Dilutions 1:100

# Get average between the replicates
d100.mean = cd8.norm %>% 
  mutate(Mock_mean = rowMeans(select(., mock_a, mock_b)),
         D100_mean = rowMeans(select(., d1_100_a, d1_100_b))) %>% 
  mutate(M = log2(D100_mean / Mock_mean),
         A = (log2(D100_mean) + log2(Mock_mean)) / 2) 

# Plot scatterplot
d100.scatter = d100.mean %>% 
  ggplot(aes(x = log10(Mock_mean), y = log10(D100_mean))) +
  geom_point(colour = alpha("grey", 0.7)) +
  geom_point(colour = alpha("red", 0.7), data = filter(d100.mean, Id %in% c("oligo_4281", "oligo_1899"))) +
  geom_point(colour = alpha("orange", 0.7), data = filter(d100.mean, Id %in% c("oligo_1888", "oligo_4270"))) +
  geom_point(colour = alpha("lightblue", 0.7), data = filter(d100.mean, Id %in% c("oligo_913", "oligo_3295"))) +
  geom_point(colour = alpha("darkgreen", 0.7), data = filter(d100.mean, Id %in% c("oligo_1887", "oligo_4269"))) +
  theme_light(base_size = 11) +
  xlab("Mock normalized counts (log10)") +
  ylab("D100 normalized counts (log10)")
d100.scatter

# Plot MA plot
d100.ma = d100.mean %>% 
  ggplot(aes(x = A, y = M)) +
  geom_point(colour = alpha("grey", 0.5)) +
  geom_point(colour = alpha("red", 0.7), data = filter(d100.mean, Id %in% c("oligo_4281", "oligo_1899"))) +
  geom_point(colour = alpha("orange", 0.7), data = filter(d100.mean, Id %in% c("oligo_1888", "oligo_4270"))) +
  geom_point(colour = alpha("lightblue", 0.7), data = filter(d100.mean, Id %in% c("oligo_913", "oligo_3295"))) +
  geom_point(colour = alpha("darkgreen", 0.7), data = filter(d100.mean, Id %in% c("oligo_1887", "oligo_4269"))) +
  theme_light(base_size = 11) +
  ylim(-5, 5) +
  xlab(expression(paste("Mean normalized counts ", '(', log[2], ")"))) +
  ylab(expression(paste("Fold change ", '(', log[2], " D100/Mock)")))
d100.ma

Calculate statistics

# Create new DESeq2 object with poor samples removed
ddsMat.subset = ddsMat.cd8[,which(colnames(assay(ddsMat.cd8)) %in% c("mock_a", "mock_b", "d1_100_a", "d1_100_b"))]
ddsMat.subset$Group = droplevels(ddsMat.subset$Group)
ddsMat.subset$Group = relevel(ddsMat.subset$Group, "Mock")

# Run stats
ddsMat.subset <- DESeq(ddsMat.subset)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
# Get results (D100 / Mock)
results(ddsMat.subset, alpha = 0.05) %>% 
  summary()
## 
## out of 4763 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 9, 0.19%
## LFC < 0 (down)     : 13, 0.27%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Get table of results
d100.mock.res = results(ddsMat.subset, alpha = 0.05)

# View table
d100.mock.res %>% 
  as.data.frame() %>% 
  rownames_to_column("Id") %>% 
  filter(padj < 0.05)
##            Id  baseMean log2FoldChange      lfcSE       stat        pvalue
## 1  oligo_1160 5517.9690      0.2744181 0.07135514   3.845807  1.201561e-04
## 2  oligo_1202 5002.0315     -0.2653457 0.06468634  -4.102036  4.095311e-05
## 3  oligo_1290  652.8729      0.7138063 0.18876227   3.781509  1.558804e-04
## 4  oligo_1416 2380.3230     -0.3554626 0.09489771  -3.745744  1.798597e-04
## 5  oligo_1608 1771.1818     -0.4636330 0.11913279  -3.891733  9.953072e-05
## 6  oligo_1787 3589.1703     -0.3045255 0.07448680  -4.088315  4.345174e-05
## 7  oligo_1887  837.8370     -2.1215428 0.18840763 -11.260387  2.058525e-29
## 8  oligo_1895 6822.2782     -0.2933034 0.06484763  -4.522962  6.098009e-06
## 9  oligo_1899 1685.6399     -2.2079337 0.12426590 -17.767816  1.254810e-70
## 10  oligo_199 1291.5294      0.5702755 0.13792906   4.134557  3.556401e-05
## 11 oligo_2684 2278.7900     -0.4067036 0.09600833  -4.236128  2.274074e-05
## 12 oligo_2837 5393.8335      0.3333689 0.07519533   4.433373  9.277030e-06
## 13 oligo_3340 2039.9874      0.4099799 0.10343249   3.963744  7.378337e-05
## 14 oligo_3383 1962.8270     -0.4430616 0.11081468  -3.998221  6.382037e-05
## 15 oligo_4071 1726.9442     -0.4747199 0.11520062  -4.120810  3.775425e-05
## 16 oligo_4269  716.7704     -2.2632358 0.53496867  -4.230595  2.330739e-05
## 17 oligo_4281 1916.8408     -2.6562865 0.11289935 -23.527917 2.113133e-122
## 18 oligo_4509 2842.9843      0.3258153 0.08605918   3.785945  1.531255e-04
## 19 oligo_4750 3009.7909      0.3492666 0.08266609   4.225029  2.389105e-05
## 20  oligo_504 2322.6317      0.4146858 0.09744284   4.255682  2.084125e-05
## 21  oligo_622 3325.7133     -0.2936877 0.07720321  -3.804086  1.423284e-04
## 22  oligo_779 2795.2172      0.3861536 0.09520987   4.055815  4.995977e-05
##             padj
## 1   3.179465e-02
## 2   1.592005e-02
## 3   3.535516e-02
## 4   3.893963e-02
## 5   2.788617e-02
## 6   1.592005e-02
## 7   3.268252e-26
## 8   7.261204e-03
## 9   2.988329e-67
## 10  1.592005e-02
## 11  1.264367e-02
## 12  8.837299e-03
## 13  2.196439e-02
## 14  2.026509e-02
## 15  1.592005e-02
## 16  1.264367e-02
## 17 1.006485e-118
## 18  3.535516e-02
## 19  1.264367e-02
## 20  1.264367e-02
## 21  3.535516e-02
## 22  1.699703e-02
# Show quick plot of significance ()
DESeq2::plotMA(d100.mock.res)

Dilutions 1:300

# Get average between the replicates
d300.mean = cd8.norm %>% 
  mutate(Mock_mean = rowMeans(select(., mock_a, mock_b)),
         D300_mean = rowMeans(select(., d1_300_a, d1_300_b))) %>% 
  mutate(M = log2(D300_mean / Mock_mean),
         A = (log2(D300_mean) + log2(Mock_mean)) / 2) 

# Plot scatterplot
d300.scatter = d300.mean %>% 
  ggplot(aes(x = log10(Mock_mean), y = log10(D300_mean))) +
  geom_point(colour = alpha("grey", 0.7)) +
  geom_point(colour = alpha("red", 0.7), data = filter(d300.mean, Id %in% c("oligo_4281", "oligo_1899"))) +
  geom_point(colour = alpha("orange", 0.7), data = filter(d300.mean, Id %in% c("oligo_1888", "oligo_4270"))) +
  geom_point(colour = alpha("lightblue", 0.7), data = filter(d300.mean, Id %in% c("oligo_913", "oligo_3295"))) +
  geom_point(colour = alpha("darkgreen", 0.7), data = filter(d300.mean, Id %in% c("oligo_1887", "oligo_4269"))) +
  theme_light(base_size = 11) +
  xlab("Mock normalized counts (log10)") +
  ylab("D300 normalized counts (log10)")
d300.scatter

# Plot MA plot
d300.ma = d300.mean %>% 
  ggplot(aes(x = A, y = M)) +
  geom_point(colour = alpha("grey", 0.5)) +
  geom_point(colour = alpha("red", 0.7), data = filter(d300.mean, Id %in% c("oligo_4281", "oligo_1899"))) +
  geom_point(colour = alpha("orange", 0.7), data = filter(d300.mean, Id %in% c("oligo_1888", "oligo_4270"))) +
  geom_point(colour = alpha("lightblue", 0.7), data = filter(d300.mean, Id %in% c("oligo_913", "oligo_3295"))) +
  geom_point(colour = alpha("darkgreen", 0.7), data = filter(d300.mean, Id %in% c("oligo_1887", "oligo_4269"))) +
  theme_light(base_size = 11) +
  ylim(-5, 5) +
  xlab(expression(paste("Mean normalized counts ", '(', log[2], ")"))) +
  ylab(expression(paste("Fold change ", '(', log[2], " D300/Mock)")))
d300.ma

Calculate statistics

# Create new DESeq2 object with poor samples removed
ddsMat.subset = ddsMat.cd8[,which(colnames(assay(ddsMat.cd8)) %in% c("mock_a", "mock_b", "d1_300_a", "d1_300_b"))]
ddsMat.subset$Group = droplevels(ddsMat.subset$Group)
ddsMat.subset$Group = relevel(ddsMat.subset$Group, "Mock")

# Run stats
ddsMat.subset <- DESeq(ddsMat.subset)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
# Get results (D300 / Mock)
results(ddsMat.subset, alpha = 0.05) %>% 
  summary()
## 
## out of 4762 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 8, 0.17%
## LFC < 0 (down)     : 7, 0.15%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Get table of results
d300.mock.res = results(ddsMat.subset, alpha = 0.05)

# View table
d300.mock.res %>% 
  as.data.frame() %>% 
  rownames_to_column("Id") %>% 
  filter(padj < 0.05)
##            Id baseMean log2FoldChange      lfcSE       stat       pvalue
## 1  oligo_1000 3066.379      0.3295100 0.08449434   3.899787 9.627717e-05
## 2  oligo_1008 5925.682     -0.2551563 0.06265047  -4.072696 4.647206e-05
## 3  oligo_1274 2022.303      0.4862157 0.10492878   4.633769 3.590678e-06
## 4  oligo_1757 2257.253      0.3634583 0.09566470   3.799293 1.451093e-04
## 5  oligo_1803 1840.728     -0.4749228 0.11740305  -4.045234 5.227089e-05
## 6  oligo_1887 1127.674     -0.6098858 0.14386865  -4.239185 2.243327e-05
## 7  oligo_1899 2044.892     -1.0719983 0.11441785  -9.369152 7.311744e-21
## 8   oligo_199 1295.388      0.5784940 0.14433208   4.008076 6.121535e-05
## 9  oligo_2948 4159.132     -0.2676614 0.06809504  -3.930703 8.469790e-05
## 10 oligo_3105 1757.511      0.4309627 0.11078074   3.890231 1.001487e-04
## 11 oligo_3272 1798.766      0.4108322 0.10728245   3.829445 1.284325e-04
## 12 oligo_4129 4365.094      0.2947775 0.07047939   4.182463 2.883673e-05
## 13 oligo_4281 2355.140     -1.2394346 0.10062181 -12.317753 7.269179e-35
## 14 oligo_4488 1962.145      0.3928846 0.10224162   3.842707 1.216846e-04
## 15 oligo_4557 2244.526     -0.3820882 0.09905877  -3.857187 1.146996e-04
##            padj
## 1  4.335530e-02
## 2  3.555914e-02
## 3  5.699603e-03
## 4  4.606735e-02
## 5  3.555914e-02
## 6  2.670681e-02
## 7  1.740926e-17
## 8  3.643844e-02
## 9  4.335530e-02
## 10 4.335530e-02
## 11 4.368539e-02
## 12 2.746411e-02
## 13 3.461583e-31
## 14 4.368539e-02
## 15 4.368539e-02
# Show quick plot of significance ()
DESeq2::plotMA(d300.mock.res)

Dilutions 1:1000

# Get average between the replicates
d1000.mean = cd8.norm %>% 
  mutate(Mock_mean = rowMeans(select(., mock_a, mock_b)),
         D1000_mean = rowMeans(select(., d1_1000_a, d1_1000_b))) %>% 
  mutate(M = log2(D1000_mean / Mock_mean),
         A = (log2(D1000_mean) + log2(Mock_mean)) / 2) 

# Plot scatterplot
d1000.scatter = d1000.mean %>% 
  ggplot(aes(x = log10(Mock_mean), y = log10(D1000_mean))) +
  geom_point(colour = alpha("grey", 0.7)) +
  geom_point(colour = alpha("red", 0.7), data = filter(d1000.mean, Id %in% c("oligo_4281", "oligo_1899"))) +
  geom_point(colour = alpha("orange", 0.7), data = filter(d1000.mean, Id %in% c("oligo_1888", "oligo_4270"))) +
  geom_point(colour = alpha("lightblue", 0.7), data = filter(d1000.mean, Id %in% c("oligo_913", "oligo_3295"))) +
  geom_point(colour = alpha("darkgreen", 0.7), data = filter(d1000.mean, Id %in% c("oligo_1887", "oligo_4269"))) +
  theme_light(base_size = 11) +
  xlab("Mock normalized counts (log10)") +
  ylab("D1000 normalized counts (log10)")
d1000.scatter

# Plot MA plot
d1000.ma = d1000.mean %>% 
  ggplot(aes(x = A, y = M)) +
  geom_point(colour = alpha("grey", 0.5)) +
  geom_point(colour = alpha("red", 0.7), data = filter(d1000.mean, Id %in% c("oligo_4281", "oligo_1899"))) +
  geom_point(colour = alpha("orange", 0.7), data = filter(d1000.mean, Id %in% c("oligo_1888", "oligo_4270"))) +
  geom_point(colour = alpha("lightblue", 0.7), data = filter(d1000.mean, Id %in% c("oligo_913", "oligo_3295"))) +
  geom_point(colour = alpha("darkgreen", 0.7), data = filter(d1000.mean, Id %in% c("oligo_1887", "oligo_4269"))) +
  theme_light(base_size = 11) +
  ylim(-5, 5) +
  xlab(expression(paste("Mean normalized counts ", '(', log[2], ")"))) +
  ylab(expression(paste("Fold change ", '(', log[2], " D1000/Mock)")))
d1000.ma

Calculate statistics

# Create new DESeq2 object with poor samples removed
ddsMat.subset = ddsMat.cd8[,which(colnames(assay(ddsMat.cd8)) %in% c("mock_a", "mock_b", "d1_1000_a", "d1_1000_b"))]
ddsMat.subset$Group = droplevels(ddsMat.subset$Group)
ddsMat.subset$Group = relevel(ddsMat.subset$Group, "Mock")

# Run stats
ddsMat.subset <- DESeq(ddsMat.subset)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# Get results (D1000 / Mock)
results(ddsMat.subset, alpha = 0.05) %>% 
  summary()
## 
## out of 4762 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 0, 0%
## LFC < 0 (down)     : 2, 0.042%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Get table of results
d1000.mock.res = results(ddsMat.subset, alpha = 0.05)

# View table
d1000.mock.res %>% 
  as.data.frame() %>% 
  rownames_to_column("Id") %>% 
  filter(padj < 0.05)
##           Id baseMean log2FoldChange     lfcSE      stat       pvalue
## 1  oligo_311 1883.486     -0.5499452 0.1252935 -4.389257 1.137384e-05
## 2 oligo_3425 2306.650     -0.5487480 0.1177323 -4.660981 3.147060e-06
##         padj
## 1 0.02708112
## 2 0.01498630
# Show quick plot of significance ()
DESeq2::plotMA(d1000.mock.res)

Plot serial dilutions together

# Scatter plot
d10.scatter + d100.scatter + d300.scatter + d1000.scatter 

# MA plot
d10.ma + d100.ma + d300.ma + d1000.ma

Filter low count oligos (visualization)

# Rank
cd8.norm.long %>% 
  group_by(Id) %>% 
  summarise(sum = sum(norm.counts),
            mean = mean(norm.counts),
            median = median(norm.counts),
            sd = sd(norm.counts),
            geomean = gm_mean(norm.counts),
            cv = sd(norm.counts) / mean(norm.counts))  %>% 
  mutate(rank = rank(mean)) %>% 
  ggplot(aes(x = rank, y = log10(mean))) +
  geom_point() +
  theme_minimal()

Plot average abundance

# Get mean counts per oligo
cd8.mean = cd8.norm.long %>% 
  group_by(Id) %>% 
  summarise(sum = sum(norm.counts),
            mean = mean(norm.counts),
            median = median(norm.counts),
            sd = sd(norm.counts),
            geomean = gm_mean(norm.counts),
            cv = sd(norm.counts) / mean(norm.counts)) 

# Get quantiles
quantile(cd8.mean$median , seq(from = 0, to = 0.10, by = 0.005))
##         0%       0.5%         1%       1.5%         2%       2.5%         3% 
##   0.000000   0.318799   1.062006   1.978373   9.955168  27.003683  74.638207 
##       3.5%         4%       4.5%         5%       5.5%         6%       6.5% 
## 142.062296 218.769373 319.850709 396.491177 445.067499 500.165253 551.465274 
##         7%       7.5%         8%       8.5%         9%       9.5%        10% 
## 592.921535 651.214172 689.947011 741.235201 767.365712 801.337192 831.117392
# Histogram with 5% cutoff
cd8.norm.long %>% 
  group_by(Id) %>% 
  summarise(total.count = sum(norm.counts),
            mean.count = mean(norm.counts),
            median = median(norm.counts),
            geomean = gm_mean(norm.counts),
            sd.count = sd(norm.counts)) %>% 
  ggplot(aes(x = log10(median))) +
  geom_histogram(binwidth = 0.25, colour = "black", fill = "white") +
  geom_density(alpha = 0.2, fill = "#FF6666") +
  geom_vline(xintercept = log10(396.491177), color = "red") +
  theme_light(base_size = 11) +
  xlab("Normalized reads per oligo (log10)")

Apply 5% cutoff

# Get ID's within the limits
## Cutoff = 5%
idx <- cd8.norm.long %>% 
  group_by(Id) %>% 
  summarise(sum = sum(norm.counts),
            mean = mean(norm.counts),
            median = median(norm.counts),
            geomean = gm_mean(norm.counts),
            sd = sd(norm.counts) ,
            c.v = sd/mean) %>% 
  filter(median >= 396.491177) %>% 
  pull(Id)

# Filter table
cd8.norm.fil = cd8.norm %>% 
  filter(Id %in% idx)

# Write filter table to disk
write_tsv(cd8.norm, "tables/cd8-counts-norm-filtered.tsv")

# Get dimensions
dim(cd8.norm)
## [1] 4764   15
dim(cd8.norm.fil)
## [1] 4525   15
# Create long dataframe
cd8.norm.fil.long = cd8.norm.fil %>% 
  gather(sample, norm.counts, -Id) 

# Plot boxplot
cd8.norm.fil.long %>% 
  ggplot(aes(x = sample, y = log10(norm.counts + 1))) +
  geom_boxplot() +
  theme_light(base_size = 11) +
  theme(axis.text.x = element_text(angle = 90)) +
  xlab("") +
  ylab("Normalized counts (log10)") +
  ggtitle('Median-normalized data w/ filtering') 

# Histogram all
cd8.norm.fil.long %>% 
  ggplot(aes(x = log10(norm.counts))) +
  geom_histogram(aes(y=..density..), binwidth=0.25, colour="black", fill="white") +
  geom_density(alpha = 0.2, fill = "#FF6666") +
  facet_wrap(sample~.) +
  theme_light(base_size = 11) +
  ylab("Density") +
  xlab("(Filtered) Normalized reads per oligo (log10)")

MA plot (1D3/CDK4.53)

# Get average between the replicates
id3.mean = cd8.norm.fil %>% 
  mutate(Mock_mean = rowMeans(select(., mock_a, mock_b)),
         CDK4_mean = rowMeans(select(., cdk453_a, cdk453_b)),
         ID3_mean = rowMeans(select(., id3_a, id3_b))) %>% 
  mutate(M = log2(ID3_mean / CDK4_mean),
         A = (log2(ID3_mean) + log2(CDK4_mean)) / 2) 

# Plot MA plot
id3.mean %>% 
  ggplot(aes(x = A, y = M)) +
  geom_point(colour = alpha("grey", 0.5)) +
  geom_point(size = 3.5, colour = rgb(237,157,148, max=255), data = filter(id3.mean, Id %in% c("oligo_4281", "oligo_1899"))) + #MART1-Ag
  geom_point(size = 3.5, colour = rgb(199,150,149, max=255), data = filter(id3.mean, Id %in% c("oligo_913", "oligo_3295"))) + #MART1-W
  geom_point(size = 3.5, colour = rgb(102,165,165, max=255), data = filter(id3.mean, Id %in% c("oligo_1887", "oligo_4269"))) + #CDK4-Ag
  geom_point(size = 3.5, colour = rgb(88,121,163, max=255), data = filter(id3.mean, Id %in% c("oligo_1888", "oligo_4270"))) + #CDK4-WT
  theme_light(base_size = 11) +
  xlab(expression(paste("Mean normalized counts ", '(', log[2], ")"))) +
  ylab(expression(paste("Fold change ", '(', log[2], " 1D3/CDK4)"))) 

### Titration data

# 1:10
d10.mean = cd8.norm.fil %>% 
  mutate(Mock_mean = rowMeans(select(., mock_a, mock_b)),
         D10_mean = rowMeans(select(., d1_10_a, d1_10_b))) %>% 
  mutate(M = log2(D10_mean / Mock_mean),
         A = (log2(D10_mean) + log2(Mock_mean)) / 2) 

d10.plot = d10.mean %>% 
  ggplot(aes(x = A, y = M)) +
  geom_point(colour = alpha("grey", 0.5)) +
  geom_point(size = 4.5, colour = rgb(237,157,148, max=255), data = filter(d10.mean, Id %in% c("oligo_4281", "oligo_1899"))) + #MART1-Ag
  geom_point(size = 4.5, colour = rgb(199,150,149, max=255), data = filter(d10.mean, Id %in% c("oligo_913", "oligo_3295"))) + #MART1-W
  geom_point(size = 4.5, colour = rgb(102,165,165, max=255), data = filter(d10.mean, Id %in% c("oligo_1887", "oligo_4269"))) + #CDK4-Ag
  geom_point(size = 4.5, colour = rgb(88,121,163, max=255), data = filter(d10.mean, Id %in% c("oligo_1888", "oligo_4270"))) + #CDK4-WT
  theme_light(base_size = 11) +
  ylim(-5, 1) +
  xlim(7, 14) +
  xlab("") +
  ylab("")
d10.plot 

# 1:100
d100.mean = cd8.norm.fil %>% 
  mutate(Mock_mean = rowMeans(select(., mock_a, mock_b)),
         D100_mean = rowMeans(select(., d1_100_a, d1_100_b))) %>% 
  mutate(M = log2(D100_mean / Mock_mean),
         A = (log2(D100_mean) + log2(Mock_mean)) / 2) 

d100.plot = d100.mean %>% 
  ggplot(aes(x = A, y = M)) +
  geom_point(colour = alpha("grey", 0.5)) +
  geom_point(size = 4.5, colour = rgb(237,157,148, max=255), data = filter(d100.mean, Id %in% c("oligo_4281", "oligo_1899"))) + #MART1-Ag
  geom_point(size = 4.5, colour = rgb(199,150,149, max=255), data = filter(d100.mean, Id %in% c("oligo_913", "oligo_3295"))) + #MART1-W
  geom_point(size = 4.5, colour = rgb(102,165,165, max=255), data = filter(d100.mean, Id %in% c("oligo_1887", "oligo_4269"))) + #CDK4-Ag
  geom_point(size = 4.5, colour = rgb(88,121,163, max=255), data = filter(d100.mean, Id %in% c("oligo_1888", "oligo_4270"))) + #CDK4-WT
  theme_light(base_size = 11) +
  ylim(-5, 1) +
  xlim(7, 14) +
  xlab("") +
  ylab("")
d100.plot

# 1:300
d300.mean = cd8.norm.fil %>% 
  mutate(Mock_mean = rowMeans(select(., mock_a, mock_b)),
         D300_mean = rowMeans(select(., d1_300_a, d1_300_b))) %>% 
  mutate(M = log2(D300_mean / Mock_mean),
         A = (log2(D300_mean) + log2(Mock_mean)) / 2) 

d300.plot = d300.mean %>% 
  ggplot(aes(x = A, y = M)) +
  geom_point(colour = alpha("grey", 0.5)) +
  geom_point(size = 4.5, colour = rgb(237,157,148, max=255), data = filter(d300.mean, Id %in% c("oligo_4281", "oligo_1899"))) + #MART1-Ag
  geom_point(size = 4.5, colour = rgb(199,150,149, max=255), data = filter(d300.mean, Id %in% c("oligo_913", "oligo_3295"))) + #MART1-W
  geom_point(size = 4.5, colour = rgb(102,165,165, max=255), data = filter(d300.mean, Id %in% c("oligo_1887", "oligo_4269"))) + #CDK4-Ag
  geom_point(size = 4.5, colour = rgb(88,121,163, max=255), data = filter(d300.mean, Id %in% c("oligo_1888", "oligo_4270"))) + #CDK4-WT
  theme_light(base_size = 11) +
  ylim(-5, 1) +
  xlim(7, 14) +
  xlab("") +
  ylab(expression(paste("Fold change ", '(', log[2], " D300/Mock)"))) 
d300.plot

# 1:1000
d1000.mean = cd8.norm.fil %>% 
  mutate(Mock_mean = rowMeans(select(., mock_a, mock_b)),
         D1000_mean = rowMeans(select(., d1_1000_a, d1_1000_b))) %>% 
  mutate(M = log2(D1000_mean / Mock_mean),
         A = (log2(D1000_mean) + log2(Mock_mean)) / 2) 

d1000.plot = d1000.mean %>% 
  ggplot(aes(x = A, y = M)) +
  geom_point(colour = alpha("grey", 0.5)) +
  geom_point(size = 4.5, colour = rgb(237,157,148, max=255), data = filter(d1000.mean, Id %in% c("oligo_4281", "oligo_1899"))) + #MART1-Ag
  geom_point(size = 4.5, colour = rgb(199,150,149, max=255), data = filter(d1000.mean, Id %in% c("oligo_913", "oligo_3295"))) + #MART1-W
  geom_point(size = 4.5, colour = rgb(102,165,165, max=255), data = filter(d1000.mean, Id %in% c("oligo_1887", "oligo_4269"))) + #CDK4-Ag
  geom_point(size = 4.5, colour = rgb(88,121,163, max=255), data = filter(d1000.mean, Id %in% c("oligo_1888", "oligo_4270"))) + #CDK4-WT
  theme_light(base_size = 11) +
  ylim(-5, 1) +
  xlim(7, 14) +
  xlab(expression(paste("Mean normalized counts ", '(', log[2], ")"))) +
  ylab("")
d1000.plot

# Plot as single panel
d10.plot / d100.plot / d300.plot / d1000.plot

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggpubr_0.4.0                forcats_0.5.1              
##  [3] stringr_1.4.0               dplyr_1.0.5                
##  [5] purrr_0.3.4                 readr_1.4.0                
##  [7] tidyr_1.1.3                 tibble_3.1.0               
##  [9] ggplot2_3.3.3               tidyverse_1.3.0            
## [11] readxl_1.3.1                DESeq2_1.30.1              
## [13] SummarizedExperiment_1.20.0 Biobase_2.50.0             
## [15] MatrixGenerics_1.2.1        matrixStats_0.58.0         
## [17] GenomicRanges_1.42.0        GenomeInfoDb_1.26.4        
## [19] IRanges_2.24.1              S4Vectors_0.28.1           
## [21] BiocGenerics_0.36.0         ggsci_2.9                  
## [23] patchwork_1.1.1             knitr_1.31                 
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6           fs_1.5.0               lubridate_1.7.10      
##  [4] bit64_4.0.5            RColorBrewer_1.1-2     httr_1.4.2            
##  [7] tools_4.0.3            backports_1.2.1        bslib_0.2.4           
## [10] utf8_1.2.1             R6_2.5.0               DBI_1.1.1             
## [13] colorspace_2.0-0       withr_2.4.1            tidyselect_1.1.0      
## [16] curl_4.3               bit_4.0.4              compiler_4.0.3        
## [19] cli_2.3.1              rvest_1.0.0            xml2_1.3.2            
## [22] DelayedArray_0.16.3    labeling_0.4.2         sass_0.3.1            
## [25] scales_1.1.1           genefilter_1.72.1      digest_0.6.27         
## [28] foreign_0.8-80         rmarkdown_2.7          rio_0.5.26            
## [31] XVector_0.30.0         pkgconfig_2.0.3        htmltools_0.5.1.1     
## [34] highr_0.8              dbplyr_2.1.0           fastmap_1.1.0         
## [37] rlang_0.4.10           rstudioapi_0.13        RSQLite_2.2.5         
## [40] farver_2.1.0           jquerylib_0.1.3        generics_0.1.0        
## [43] jsonlite_1.7.2         BiocParallel_1.24.1    zip_2.1.1             
## [46] car_3.0-10             RCurl_1.98-1.3         magrittr_2.0.1        
## [49] GenomeInfoDbData_1.2.4 Matrix_1.2-18          Rcpp_1.0.6            
## [52] munsell_0.5.0          fansi_0.4.2            abind_1.4-5           
## [55] lifecycle_1.0.0        stringi_1.5.3          yaml_2.2.1            
## [58] carData_3.0-4          zlibbioc_1.36.0        grid_4.0.3            
## [61] blob_1.2.1             crayon_1.4.1           lattice_0.20-41       
## [64] haven_2.3.1            splines_4.0.3          annotate_1.68.0       
## [67] hms_1.0.0              locfit_1.5-9.4         pillar_1.5.1          
## [70] ggsignif_0.6.1         geneplotter_1.68.0     reprex_1.0.0          
## [73] XML_3.99-0.6           glue_1.4.2             evaluate_0.14         
## [76] data.table_1.14.0      modelr_0.1.8           vctrs_0.3.7           
## [79] cellranger_1.1.0       gtable_0.3.0           assertthat_0.2.1      
## [82] cachem_1.0.4           openxlsx_4.2.3         xfun_0.22             
## [85] xtable_1.8-4           broom_0.7.5            rstatix_0.7.0         
## [88] survival_3.2-7         AnnotationDbi_1.52.0   memoise_2.0.0         
## [91] ellipsis_0.3.1